import csv
import requests
from lxml import etree
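import time  # for the polite per-page delay added below (not in the original script)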

fp = open('./doubanbook.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('Title', 'Link', 'Author', 'Publisher', 'Publish Date', 'Price', 'Rating', 'Ratings Count'))
# Request headers: a desktop-browser User-Agent so the site serves the normal page
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# The Top 250 is paginated 25 books per page; range(0, 250, 25) covers all 10 pages
urls = ['https://book.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
# Session cookies copied from a browser; they expire, so refresh them if Douban starts
# blocking requests. The raw string is sent via the Cookie header because requests'
# cookies= parameter expects individual name/value pairs, not one raw cookie string.
headers['Cookie'] = 'bid=CGwcJh4yts4; ap_v=0,6.0; __utmc=30149280; __utmz=30149280.1625994390.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmc=81379588; __utmz=81379588.1625994390.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=30149280.641672736.1625994390.1625994390.1625996602.2; __utmt_douban=1; __utma=81379588.1355094855.1625994390.1625994390.1625996602.2; __utmt=1; _pk_ses.100001.3ac3=*; __utmb=30149280.2.10.1625996602; __utmb=81379588.2.10.1625996602; _pk_id.100001.3ac3=82251eeee80fc41e.1625994389.2.1625996751.1625994389.'
# urls = ['https://book.douban.com/top250?start=0']  # uncomment to test a single page
page = 1
for url in urls:
    html = requests.get(url, headers=headers, timeout=10)  # 10s timeout so a hung request cannot stall the crawl
    # print(html.text)
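    # Stop early if Douban blocks or rate-limits the crawl (status check added; not in the original)
    if html.status_code != 200:
        print('Got HTTP {} for {}, stopping.'.format(html.status_code, url))
        break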
    select = etree.HTML(html.text)  # parse the page into an lxml element tree
    infos = select.xpath('//tr[@class="item"]')  # grab each book's row first, then drill into its fields
    # print(len(infos))  # sanity check: should be 25 books per page
    for info in infos:
        # info = infos[0]
        names = info.xpath('td/div/a/@title')[0]
        links = info.xpath('td/div/a/@href')[0]
        # The first <p> holds "author / publisher / publish date / price" in one string
        bookinfos = info.xpath('td/p/text()')[0]
        parts = bookinfos.split('/')
        # Index from the end so extra slashes in multi-author entries don't shift the fields
        author = parts[0].strip()
        publishers = parts[-3].strip()
        times = parts[-2].strip()
        prices = parts[-1].strip()
        grades = info.xpath('td/div/span[2]/text()')[0]  # rating score
        evaluates = info.xpath('td/div/span[3]/text()')[0]  # ratings count, e.g. "(123456人评价)"
        writer.writerow((names, links, author, publishers, times, prices, grades, evaluates))
        print(names, links, author, publishers, times, prices, grades, evaluates)
    print('Finished scraping page {}'.format(page))
    page += 1
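    # Pause between pages to avoid hammering the server (1 second is an assumed, polite default)
    time.sleep(1)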
fp.close()